In [26]:
from __future__ import division
from collections import Counter, defaultdict
import math
from functools import partial

In [18]:
def entropy(class_probabilities):
    """given list of class probs p_i, compute entropy"""
    return sum(-p * math.log(p, 2)
              for p in class_probabilities
              if p)

def class_probabilities(labels):
    total_count = len(labels)
    return [count / total_count
           for count in Counter(labels).values()]

def data_entropy(labeled_data):
    labels = [label for _, label in labeled_data]
    probabilities = class_probabilities(labels)
    return entropy(probabilities)
    
# partition entropy is the weighted sum of each individual entropy
def partition_entropy(subsets):
    """computes the entropy across all subsets (list of lists)"""
    total_count = sum(len(subset) for subset in subsets)
    
    return sum(len(subset) * data_entropy(subset) / total_count
              for subset in subsets)
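
As a quick sanity check (a hypothetical usage of the functions above, not from the original): a pure set carries no surprise, a 50/50 split carries one full bit, and partition_entropy takes the size-weighted average across subsets.

print entropy([1.0])       # 0.0 -- a pure set
print entropy([0.5, 0.5])  # 1.0 -- a fair coin

subsets = [[('a', True), ('b', True)],    # pure subset: 0 bits
           [('c', True), ('d', False)]]   # mixed subset: 1 bit
print partition_entropy(subsets)  # (2/4)*0.0 + (2/4)*1.0 = 0.5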

In [20]:
inputs = [
        ({'level':'Senior','lang':'Java','tweets':'no','phd':'no'},   False),
        ({'level':'Senior','lang':'Java','tweets':'no','phd':'yes'},  False),
        ({'level':'Mid','lang':'Python','tweets':'no','phd':'no'},     True),
        ({'level':'Junior','lang':'Python','tweets':'no','phd':'no'},  True),
        ({'level':'Junior','lang':'R','tweets':'yes','phd':'no'},      True),
        ({'level':'Junior','lang':'R','tweets':'yes','phd':'yes'},    False),
        ({'level':'Mid','lang':'R','tweets':'yes','phd':'yes'},        True),
        ({'level':'Senior','lang':'Python','tweets':'no','phd':'no'}, False),
        ({'level':'Senior','lang':'R','tweets':'yes','phd':'no'},      True),
        ({'level':'Junior','lang':'Python','tweets':'yes','phd':'no'}, True),
        ({'level':'Senior','lang':'Python','tweets':'yes','phd':'yes'},True),
        ({'level':'Mid','lang':'Python','tweets':'no','phd':'yes'},    True),
        ({'level':'Mid','lang':'Java','tweets':'yes','phd':'no'},      True),
        ({'level':'Junior','lang':'Python','tweets':'no','phd':'yes'},False)
    ]

ID3

  1. If all the data have the same label, make a leaf node with that label and stop
  2. If the list of attributes is empty, make a leaf node with the most common label and stop
  3. Otherwise, partition the data by each remaining attribute
  4. Choose the partition with the lowest entropy
  5. Add a decision node that splits on that attribute
  6. Recurse on each subset using the remaining attributes

In [21]:
def partition_by(inputs, attribute):
    """input is pair (attr_dict, label)
    returns dict: attr_value -> inputs"""
    groups = defaultdict(list)
    for input in inputs:
        key = input[0][attribute] # key on value of given attribute
        groups[key].append(input) # add the input to this list
    return groups

def partition_entropy_by(inputs, attribute):
    """computes total entropy for a given attribute partitioning"""
    partitions = partition_by(inputs, attribute)
    return partition_entropy(partitions.values())
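
To see the shape partition_by returns, here is a quick illustration (hypothetical usage, assuming the inputs list above; dict ordering may vary):

partitions = partition_by(inputs, 'level')
for level, subset in partitions.iteritems():
    print level, len(subset)
# Senior 5
# Mid 4
# Junior 5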

In [22]:
for key in ['level','lang','tweets','phd']:
    print key, partition_entropy_by(inputs, key)


level 0.693536138896
lang 0.860131712855
tweets 0.788450457308
phd 0.892158928262

In [24]:
# 'level' gives the lowest partition entropy, so make the first split there
senior_inputs = [(input, label)
                for input, label in inputs if input['level'] == 'Senior']

for key in ['lang','tweets','phd']:
    print key, partition_entropy_by(senior_inputs, key)


lang 0.4
tweets 0.0
phd 0.950977500433

The tweets partition has zero entropy: every Senior who tweets is labeled True and every Senior who doesn't is labeled False, so that branch of the tree is done.

In [29]:
def classify(tree, input):
    """classify the input using the given decision tree"""
    
    # if it's a leaf node just spit out value
    if tree in [True, False]:
        return tree
    
    # otherwise this has an attr to split
    attribute, subtree_dict = tree
    
    subtree_key = input.get(attribute) # None if missing
    
    if subtree_key not in subtree_dict: # no tree for this key
        subtree_key = None
        
    subtree = subtree_dict[subtree_key] # get the right subtree
    
    return classify(subtree, input)


def build_tree_id3(inputs, split_candidates=None):
    
    if split_candidates is None: # first time through
        split_candidates = inputs[0][0].keys() # all keys are possible
        
    # count the True and False labels
    num_inputs = len(inputs)
    num_trues = len([label for item, label in inputs if label])
    num_falses = num_inputs - num_trues
    
    # Step 1) if pure then return label
    if num_trues == 0:
        return False
    if num_falses == 0:
        return True
    
    # Step 2) if we have no candidates left return majority vote
    if not split_candidates:
        return num_trues >= num_falses
    
    # Step 3) otherwise, recurse
    # find the attribute whose partition has the lowest entropy
    best_attribute = min(split_candidates,
                         key=partial(partition_entropy_by, inputs))
    
    # split on that
    partitions = partition_by(inputs, best_attribute)
    # remove this attr from the candidates
    new_candidates = [a for a in split_candidates
                     if a != best_attribute]
    
    # recurse
    subtrees = {attribute_value : build_tree_id3(subset, new_candidates)
               for attribute_value, subset in partitions.iteritems()}
    
    subtrees[None] = num_trues > num_falses # default case of missing value
    
    return (best_attribute, subtrees)

In [30]:
tree = build_tree_id3(inputs)

In [31]:
tree


Out[31]:
('level',
 {None: True,
  'Junior': ('phd', {None: True, 'no': True, 'yes': False}),
  'Mid': True,
  'Senior': ('tweets', {None: False, 'no': False, 'yes': True})})

In [32]:
classify(tree, {'level' : 'Junior',
               'lang' : 'Java',
               'tweets' : 'yes',
               'phd' : 'no'})


Out[32]:
True
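
The None subtrees make classification robust to unseen or missing values; a quick check (hypothetical inputs, not from the original):

print classify(tree, {'level' : 'Intern'})  # True: 'Intern' is unseen, falls back to the None subtree
print classify(tree, {'level' : 'Senior'})  # False: no 'tweets' key, falls back to that node's None subtree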

Random Forest

  • Instead of training one tree on all the data, train each tree on a bootstrap sample (drawn with replacement)
  • Instead of considering every remaining attribute at each split, consider only a random subset of them (see the sketch below)

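A minimal sketch of both ideas, reusing build_tree_id3 and classify from above (the helper names bootstrap_sample, build_forest, forest_classify, and the num_split_candidates tweak are assumptions of this sketch, not from the original):

import random

def bootstrap_sample(data):
    """len(data) points sampled with replacement"""
    return [random.choice(data) for _ in data]

def build_forest(inputs, num_trees=5):
    """train each tree on a different bootstrap sample"""
    return [build_tree_id3(bootstrap_sample(inputs))
            for _ in range(num_trees)]

def forest_classify(trees, input):
    """majority vote across the trees"""
    votes = [classify(tree, input) for tree in trees]
    return Counter(votes).most_common(1)[0][0]

For the second idea, the split-selection step inside build_tree_id3 would consider only a few randomly chosen attributes instead of all of them:

    # if few enough candidates remain, look at all of them
    if len(split_candidates) <= num_split_candidates:
        sampled_candidates = split_candidates
    else:
        sampled_candidates = random.sample(split_candidates,
                                           num_split_candidates)

    best_attribute = min(sampled_candidates,
                         key=partial(partition_entropy_by, inputs))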